1 Importing the Data

# Load the loan approval dataset; read.csv() already returns a data.frame,
# so the extra data.frame() wrapper was redundant and has been removed.
data_pre <- read.csv("loan_approval_dataset.csv")
data <- data_pre

2 Structure of the data

str(data)
## 'data.frame':    4269 obs. of  13 variables:
##  $ loan_id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ no_of_dependents        : int  2 0 3 3 5 0 5 2 0 5 ...
##  $ education               : chr  " Graduate" " Not Graduate" " Graduate" " Graduate" ...
##  $ self_employed           : chr  " No" " Yes" " No" " No" ...
##  $ income_annum            : int  9600000 4100000 9100000 8200000 9800000 4800000 8700000 5700000 800000 1100000 ...
##  $ loan_amount             : int  29900000 12200000 29700000 30700000 24200000 13500000 33000000 15000000 2200000 4300000 ...
##  $ loan_term               : int  12 8 20 8 20 10 4 20 20 10 ...
##  $ cibil_score             : int  778 417 506 467 382 319 678 382 782 388 ...
##  $ residential_assets_value: int  2400000 2700000 7100000 18200000 12400000 6800000 22500000 13200000 1300000 3200000 ...
##  $ commercial_assets_value : int  17600000 2200000 4500000 3300000 8200000 8300000 14800000 5700000 800000 1400000 ...
##  $ luxury_assets_value     : int  22700000 8800000 33300000 23300000 29400000 13700000 29200000 11800000 2800000 3300000 ...
##  $ bank_asset_value        : int  8000000 3300000 12800000 7900000 5000000 5100000 4300000 6000000 600000 1600000 ...
##  $ loan_status             : chr  " Approved" " Rejected" " Rejected" " Rejected" ...
# Count missing values across the whole dataset (expected to be 0).
NA_values <- sum(is.na(data))
NA_values

# Drop the identifier column, then convert each character column to a factor.
data <- subset(data_pre, select = -c(loan_id))
for (col in c("education", "self_employed", "loan_status")) {
  data[[col]] <- as.factor(data[[col]])
}

str(data)
summary(data)

3 EDA

# Box plot of number of dependents by loan status.
# NOTE: binwidth is a histogram argument; geom_boxplot() ignores it with a
# warning, so it has been removed.
ggplot(data, aes(y = no_of_dependents, x = loan_status, fill = loan_status)) +
  geom_boxplot(color = "black", alpha = 0.9) +
  labs(
    title = "Box plot of number of dependents and loan status",
    x = "Loan Status",
    y = "Number of Dependents"
  ) +
  scale_fill_manual(values = c("#93C572", "#4682B4")) + theme_minimal()

# The raw CSV values carry a leading space (e.g. " Graduate"); strip it from
# every character column in one mutate, then re-factor the cleaned values.
data <- data %>%
  mutate(
    self_employed = str_trim(self_employed),
    education = str_trim(education),
    loan_status = str_trim(loan_status)
  )
data$education <- as.factor(data$education)
data$self_employed <- as.factor(data$self_employed)
data$loan_status <- as.factor(data$loan_status)

# Sanity-check the column classes after cleaning.
class(data$self_employed)
class(data$luxury_assets_value)
class(data$income_annum)
class(data$loan_status)
str(data)

# Add an explicit (possibly empty) "Other" level so the manual fill colours
# in the bar chart below have a matching level.
data$self_employed <- factor(data$self_employed, levels = c("Yes", "No", "Other"))

# Grouped (dodged) bar chart of loan status split by self-employment.
# The original title said "Stacked", but position = "dodge" draws the bars
# side by side, so the title now says "Grouped".
ggplot(data, aes(x = loan_status, fill = self_employed)) +
  geom_bar(position = "dodge", stat = "count") +
  labs(title = "Grouped Bar between Loan Status and Self Employment",
       x = "Loan Status",
       y = "Count",
       fill = "self_employed") + scale_fill_manual(values = c("Yes" = "#93C572", "No" = "#4682B4", "Other" = "gray")) +
  theme_minimal()

# Numeric copy of the data for the correlation matrix later:
# 1 = Yes / Approved / Graduate, 0 otherwise.
data1 <- data
data1$self_employed <- as.integer(data$self_employed == "Yes")
data1$loan_status <- as.integer(data$loan_status == "Approved")
data1$education <- as.integer(data$education == "Graduate")

# Density of loan term by loan status. theme_bw() and the empty theme() call
# were dropped because the final theme_minimal() replaced them anyway, and
# alpha is added so the overlapping densities are both visible.
ggplot(data, aes(x = loan_term, fill = loan_status)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density plot of Loan Term based on Loan Status",
    x = "Loan Term (Years)",
    y = "Density of the Applicants"
  ) + scale_fill_manual(values = c("#93C572", "#4682B4")) + theme_minimal()

3.0.1 Scatter Plot between Cibil Score and Loan Amount

# CIBIL score vs loan amount, coloured by outcome; y axis labelled in millions.
ggplot(data, aes(x = cibil_score, y = loan_amount, color = loan_status)) +
  geom_point() +
  labs(title = "Scatter Plot of CIBIL Score vs Loan Amount",
       x = "CIBIL Score",
       y = "Loan Amount") +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  scale_x_continuous(breaks = seq(min(data$cibil_score), max(data$cibil_score), by = 50)) +
  scale_y_continuous(labels = comma_format(scale = 1e-6, suffix = "M"),
                     breaks = seq(0, 35000000, by = 5000000)) +
  theme_minimal()

# Residential assets value vs loan amount. The title previously said
# "CIBIL Score" (copy-paste slip); it now matches the plotted variable.
ggplot(data, aes(x = residential_assets_value, y = loan_amount, color = loan_status)) +
  geom_point() +
  labs(title = "Scatter Plot of Residential Assets Value vs Loan Amount",
       x = "Residential Assets value",
       y = "Loan Amount") + scale_color_manual(values = c("#93C572", "#4682B4")) + theme_minimal()

# Commercial assets value vs loan amount, coloured by outcome.
ggplot(data, aes(x = commercial_assets_value, y = loan_amount, color = loan_status)) +
  geom_point() +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  labs(title = "Scatter Plot of Commercial assets value vs Loan Amount",
       x = "Commercial assets value",
       y = "Loan Amount") +
  theme_minimal()

# Luxury assets value vs loan amount, coloured by outcome.
ggplot(data, aes(x = luxury_assets_value, y = loan_amount, color = loan_status)) +
  geom_point() +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  labs(title = "Scatter Plot of luxury assets value vs Loan Amount",
       x = "Luxury Assets value",
       y = "Loan Amount") +
  theme_minimal()

# Bank asset value vs loan amount, coloured by outcome.
ggplot(data, aes(x = bank_asset_value, y = loan_amount, color = loan_status)) +
  geom_point() +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  labs(title = "Scatter Plot of bank asset value vs Loan Amount",
       x = "bank Assets value",
       y = "Loan Amount") +
  theme_minimal()

# Annual income vs loan amount, coloured by outcome.
ggplot(data, aes(x = income_annum, y = loan_amount, color = loan_status)) +
  geom_point() +
  scale_color_manual(values = c("#93C572", "#4682B4")) +
  labs(title = "Scatter Plot of income annum vs Loan Amount",
       x = "Income annum",
       y = "Loan Amount") +
  theme_minimal()

# CIBIL score by self-employment, split by loan status; outliers suppressed
# to keep the boxes readable.
ggplot(data, aes(x = self_employed, y = cibil_score, fill = loan_status)) +
  geom_boxplot(outlier.shape = NA) +
  scale_fill_manual(values = c("Approved" = "#93C572", "Rejected" = "#4682B4")) +
  labs(title = "Box plot between loan status and cibil score",
       x = "Self Employed",
       y = "cibil score") +
  theme_minimal()

# Density of bank asset value by loan status.
# The axis label scale was 1e-7, which mislabels values (8,000,000 would
# read "0.8M"); 1e-6 correctly shows millions, matching the other plots.
ggplot(data, aes(x = bank_asset_value, fill = loan_status)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Bank Assets grouped by Loan Status",
       x = "Bank Assets",
       y = "Density") + scale_fill_manual(values = c("Approved" = "#93C572", "Rejected" = "#4682B4")) +
  theme_minimal() + scale_x_continuous(labels = comma_format(scale = 1e-6, suffix = "M"))

# Density of CIBIL score by loan status; the separation between the two
# groups is what the t-test below quantifies.
ggplot(data, aes(x = cibil_score, fill = loan_status)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("Approved" = "#93C572", "Rejected" = "#4682B4")) +
  labs(title = "Density Plot of Cibil Score grouped by Loan Status",
       x = "CIBIL Score",
       y = "Density") +
  theme_minimal()

# Correlation matrix of the fully numeric copy of the data (data1).
x <- cor(data1)
palette_colors <- colorRampPalette(brewer.pal(6, "PuOr"))(100)
corrplot(x, type = "full", tl.cex = 0.7, method = "color", col = palette_colors)

4 STATISTICAL TEST

4.1 T-Test on Loan Status (Approval/Rejection) and Cibil Score

## 
##  Welch Two Sample t-test
## 
## data:  Approved$cibil_score and rejected$cibil_score
## t = 88, df = 4263, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  268 280
## sample estimates:
## mean of x mean of y 
##       703       429

Null Hypothesis (\(H_{0}\)): CIBIL score has no significant association with loan status.

Alternate Hypothesis (\(H_{A}\)): CIBIL score has significant association with loan status.

The p-value (reported as \(< 2 \times 10^{-16}\)) is far below the standard alpha level of 0.05; hence, we reject the null hypothesis and conclude that CIBIL score has a significant association with the probability of loan approval.

4.2 Chi-square test between Education and Loan Status

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  c
## X-squared = 0.08, df = 1, p-value = 0.8

Null Hypothesis (\(H_{0}\)): Education level and loan status are independent of each other.

Alternate Hypothesis (\(H_{A}\)): Education level and loan status are dependent on each other.

Education level and loan status have a high p-value of \(0.772\). Thus, we fail to reject the null hypothesis, and we conclude that there is no evidence that the education level of an applicant has a significant impact on loan approval.

4.3 T-Test between Loan Status and Bank Asset Value

## 
##  Welch Two Sample t-test
## 
## data:  Approved$bank_asset_value and rejected$bank_asset_value
## t = -0.4, df = 3453, p-value = 0.7
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -245677  154809
## sample estimates:
## mean of x mean of y 
##   4959526   5004960

Null Hypothesis (\(H_{0}\)): Bank asset value and loan status are independent of each other.

Alternative Hypothesis (\(H_{A}\)): Bank asset value and loan status are dependent on each other.

Bank asset value and loan status have a high p-value of \(0.656\). Thus, we cannot reject the null hypothesis. We can therefore state that bank asset value and loan status are not significantly associated.

4.4 T-Test between Loan Status and Residential Assets Value

## 
##  Welch Two Sample t-test
## 
## data:  Approved$residential_assets_value and rejected$residential_assets_value
## t = -0.9, df = 3400, p-value = 0.3
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -595310  209937
## sample estimates:
## mean of x mean of y 
##   7399812   7592498

Null Hypothesis (\(H_{0}\)): There is no significant association between the values of residential asset and loan approval status.

Alternative Hypothesis (\(H_{A}\)): There is a significant association between the values of residential asset and loan approval status. With a p-value of \(0.348\), we cannot reject the null hypothesis and thus, we conclude from the null hypothesis that there exists no significant association between residential assets value and loan status.

4.5 T-test between number of dependents and loan status

## 
##  Welch Two Sample t-test
## 
## data:  Approved$no_of_dependents and rejected$no_of_dependents
## t = -1, df = 3400, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1683  0.0416
## sample estimates:
## mean of x mean of y 
##      2.47      2.54

Null Hypothesis (\(H_{0}\)): There is no significant association between the number of dependents and loan approval status.

Alternative Hypothesis (\(H_{A}\)): There is a significant association between the number of dependents and loan approval status.

With a p-value of \(0.237\), we fail to reject the null hypothesis, and thus we find no evidence of a significant association between the number of dependents and loan status.

4.6 T-test between luxury assets value and loan status

## 
##  Welch Two Sample t-test
## 
## data:  a$luxury_assets_value and r$luxury_assets_value
## t = -1, df = 3442, p-value = 0.3
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -851752  271073
## sample estimates:
## mean of x mean of y 
##  15016604  15306944

Null Hypothesis (\(H_{0}\)): The luxury assets value of an applicant has no significant association with their loan status.

Alternate Hypothesis (\(H_{A}\)): The luxury assets value of an applicant has significant association with their loan status.

With a p-value of \(0.311\), we cannot reject the null hypothesis and thus, we conclude that the luxury assets value of an applicant has no significant association with their loan status.

## Pearson Correlation Coefficient: 0.00844
## p-value: 0.582

The small value of the Pearson correlation coefficient \(0.008\) suggests a weak relationship between loan_term and loan_amount. The high p-value \(0.582\) indicates that the resultant correlation is not statistically significant.

5 Model Selection

5.0.1 Regression problem

library("leaps")

# Exhaustive best-subset search for loan_amount: up to 10 predictors,
# keeping the 2 best models of each size.
reg.best10 <- regsubsets(loan_amount ~ ., data = data, nvmax = 10, nbest = 2, method = "exhaustive")

# In the "leaps" package, we can use scale = c("bic", "Cp", "adjr2", "r2");
# draw one selection plot per criterion.
criteria <- c(adjr2 = "Adjusted R^2", r2 = "R^2", bic = "BIC", Cp = "Cp")
for (crit in names(criteria)) {
  plot(reg.best10, scale = crit, main = criteria[[crit]])
}

summary(reg.best10)

The regsubsets() selection method aims to find the best subset of predictor variables that minimizes or maximizes a chosen criterion, such as Adjusted R-squared (adjr2), R-squared (r2), Bayesian Information Criterion (BIC), or Mallows’ Cp.

In the case of the Adjusted R-squared plot, the best set of predictors is found to be: no_of_dependents, loan_term, income_annum, commercial_assets_value, cibil_score and loan_status. The R-squared plot yields the same set: no_of_dependents, loan_term, income_annum, commercial_assets_value, cibil_score and loan_status. From the BIC plot we observe that the best set of predictors is: no_of_dependents, loan_term, income_annum, residential_assets_value, commercial_assets_value, cibil_score and loan_status. From the Mallows' Cp plot we observe that the best set of predictors is: no_of_dependents, income_annum, residential_assets_value, commercial_assets_value, cibil_score and loan_status.

# Keep the regsubsets summary around for later inspection.
summaryRegForward = summary(reg.best10)
# Adjusted R2 plot: compare the candidate subsets of size >= 7 visually.
car::subsets(reg.best10, statistic="adjr2", legend = FALSE, min.size = 7, main = "Adjusted R^2 Plot")

From the Adjusted R-squared statistic plot, the most suitable set of predictors is found to be: no_of_dependents, loan_term, income_annum, commercial_assets_value, luxury_assets_value, cibil_score and loan_status.

# Mallows' Cp plot; namespaced as car::subsets() for consistency with the
# Adjusted R^2 call above (a bare subsets() fails unless car is attached).
car::subsets(reg.best10, statistic = "cp", legend = FALSE, min.size = 4, main = "Mallow Cp Plot")
# Reference line Cp = p: good models fall near or below it.
abline(a = 1, b = 1, lty = 3)

The most relevant predictors from the Mallows' Cp plot are found to be: no_of_dependents, income_annum, commercial_assets_value, cibil_score and loan_status.

5.0.2 Classification Problem

library("bestglm")
# Exhaustive AIC-based subset search over logistic-regression models.
# bestglm expects the response as the LAST column of Xy; loan_status is
# last here after loan_id was dropped.
res.bestglm <- bestglm(Xy = data, family = binomial,
            IC = "AIC",                 
            method = "exhaustive")
summary(res.bestglm)
# Top-ranked candidate models by AIC.
res.bestglm$BestModels
summary(res.bestglm$BestModels)

6 Model Creation

In our comprehensive analysis, we adopt a dual-pronged approach to enhance the predictive capabilities of our model. Specifically, we employ both regression and classification techniques to predict distinct aspects of the loan application process—loan amount and loan status, respectively.

6.1 Train Test Split

The dataset was initially explored to understand the distribution of the target variable loan_status. This binary classification variable represents whether a loan was approved or not. To ensure model generalization, the data was then split into training (80%) and test (20%) sets using a seed for reproducibility.

# Class balance of the target and the proportion of the second level.
table(data$loan_status)
table(data$loan_status)[2] / sum(table(data$loan_status))


# Reproducible 80/20 train-test split of row indices.
set.seed(1)
n_total <- nrow(data)
data_train_rows = sample(seq_len(n_total),
                              round(0.8 * n_total, 0),
                              replace = FALSE)

# Confirm the split proportion is ~0.8.
length(data_train_rows) / nrow(data)

data_train = data[data_train_rows, ]
data_test = data[-data_train_rows, ]

nrow(data_train)
nrow(data_test)

6.2 Regression

6.2.1 Linear Regression

A linear regression model was constructed using the lm() function in R, predicting loan_amount based on various features, including no_of_dependents, loan_term, income_annum, commercial_assets_value, cibil_score, and loan_status.

# Linear model for loan amount using the predictors chosen by the subset
# search above; fitted on the full dataset for the summary/VIF tables.
model <- lm(
  loan_amount ~ no_of_dependents + loan_term + income_annum +
    commercial_assets_value + cibil_score + loan_status,
  data = data
)
summary(model)

ezids::xkabledply(model, title = "Summary of loan amount prediction")
ezids::xkablevif(model)

The summary statistics and variance inflation factors (VIF) were analyzed; all VIF values are below 3, which indicates that there is no problematic multicollinearity among our features.

6.2.2 Results

# Refit the selected model on the training split only, then score both splits.
model <- lm(loan_amount ~ no_of_dependents + loan_term + income_annum +
            commercial_assets_value + cibil_score + loan_status, data = data_train)

train_predictions <- predict(model, newdata = data_train)
test_predictions <- predict(model, newdata = data_test)
plot_data <- data.frame(Actual = data_test$loan_amount, Predicted = test_predictions)

# Squared correlation between observed and fitted values = R^2 per split.
train_r_squared <- cor(data_train$loan_amount, train_predictions)^2
cat("Training R-squared:", train_r_squared, "\n")
## Training R-squared: 0.862
test_r_squared <- cor(data_test$loan_amount, test_predictions)^2
cat("Testing R-squared:", test_r_squared, "\n")
## Testing R-squared: 0.862
# Actual vs predicted; the dashed 45-degree line marks perfect prediction.
ggplot(plot_data, aes(x = Actual, y = Predicted)) +
  geom_point(color = "#93C572") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Actual vs Predicted Values",
       x = "Actual Values",
       y = "Predicted Values") +
  scale_x_continuous(labels = comma_format(scale = 1e-6, suffix = "M"),
                     breaks = seq(0, 35000000, by = 5000000)) +
  scale_y_continuous(labels = comma_format(scale = 1e-6, suffix = "M"),
                     breaks = seq(0, 35000000, by = 5000000)) +
  theme_minimal()

The scatter plot depicts the relationship between actual and predicted loan amounts, with a dashed red line marking the ideal prediction scenario. R-squared values of 0.862 for both training and testing highlight the model’s robust explanatory power and generalization to unseen data.

6.3 Classification

6.3.1 Logistic Regression

This study employs logistic regression to build a predictive model for loan status based on key features in a given dataset. The logistic regression model is constructed using the glm() function in R, with an emphasis on variables such as the number of dependents, annual income, loan amount, loan term, credit score (CIBIL score), luxury assets value, and bank assets value. The summary output of the model is analyzed to assess the significance and impact of each predictor on loan status.

6.3.2 Model building

# Logistic regression for loan status on the training split.
# glm() with family binomial models the probability of the SECOND factor
# level of loan_status ("Rejected" with the current alphabetical levels).
Logit <- glm(
  loan_status ~ no_of_dependents + income_annum + loan_amount + loan_term +
    cibil_score + luxury_assets_value + bank_asset_value,
  data = data_train, family = "binomial"
)
summary_output <- summary(Logit)
summary_output

The coefficients table reveals the estimated effects of each predictor on the log-odds of loan approval. Key findings include:

The intercept has a substantial positive effect on the log-odds. Variables such as loan_term and CIBIL_score significantly impact loan approval, as indicated by their respective z-values and low p-values. The no_of_dependents, income_annum, loan_amount, luxury_assets_value, and bank_asset_value show minimal impact on loan approval.

6.3.3 Feature Importance

# Render the logistic-regression coefficient table.
ezids::xkabledply(Logit, title =" Summary of logistic Regression for loan status")
Summary of logistic Regression for loan status
Estimate Std. Error z value Pr(>|z|)
(Intercept) 11.2729 0.4762 23.671 0.000
no_of_dependents 0.0066 0.0385 0.171 0.864
income_annum 0.0000 0.0000 5.512 0.000
loan_amount 0.0000 0.0000 -6.715 0.000
loan_term 0.1491 0.0127 11.718 0.000
cibil_score -0.0246 0.0009 -26.587 0.000
luxury_assets_value 0.0000 0.0000 -1.353 0.176
bank_asset_value 0.0000 0.0000 -1.097 0.273
# Odds ratios: exponentiate the log-odds coefficients.
expcoeff = exp(coef(Logit))
# expcoeff
ezids::xkabledply( as.table(expcoeff), title = "Exponential of coefficients in Logit Reg" )
Exponential of coefficients in Logit Reg
x
(Intercept) 7.87e+04
no_of_dependents 1.01e+00
income_annum 1.00e+00
loan_amount 1.00e+00
loan_term 1.16e+00
cibil_score 9.76e-01
luxury_assets_value 1.00e+00
bank_asset_value 1.00e+00

Feature importance summary:

  • The intercept is notably high, serving as a baseline for loan approval odds.
  • Number of dependents has minimal impact (non-significant).
  • Annual income and loan amount show limited influence on loan approval odds (coefficient of 1.00).
  • Loan term has a substantial impact on the modelled odds (about a 16% change per unit; note that the glm models the second factor level of loan_status, i.e. "Rejected").
  • Higher CIBIL scores correspond to lower odds of rejection — that is, higher chances of loan approval, consistent with the earlier t-test.
  • Luxury assets and bank assets show minimal impact on approval odds.

6.3.4 Data imbalance

# Predicted probabilities on both splits (type = "response" returns
# probabilities rather than log-odds).
data_train$prediction <- predict(Logit, newdata = data_train, type = "response")
data_test$prediction <- predict(Logit, newdata = data_test, type = "response")

# Distribution of the predicted score, grouped by the known outcome.
ggplot(data_train, aes(prediction, color = as.factor(loan_status))) +
  geom_density(size = 1) +
  ggtitle("Training Set's Predicted Score") +
  labs(color = "Loan Status")

  • Examining the prediction-score distributions reveals that the scores for approved and rejected candidates are heavily skewed toward opposite ends of the probability range. Consequently, relying solely on accuracy score and ROC score may not be sufficient for our prediction analysis.

6.3.5 Train and test metrices

# --- Training-set metrics ---
train_predictions <- predict(Logit, newdata = data_train, type = "response")

# Classify as 1 (the modelled event) when the predicted probability > 0.49.
train_predictions_class <- ifelse(train_predictions > 0.49, 1, 0)

# Rows = predicted class, columns = actual class.
train_conf_matrix <- table(Predicted = train_predictions_class, Actual = data_train$loan_status)

train_accuracy <- sum(diag(as.matrix(train_conf_matrix))) / sum(train_conf_matrix)
print(paste("Training Accuracy:", round(train_accuracy * 100, 2), "%"))
## [1] "Training Accuracy: 91.45 %"

# --- Test-set metrics ---
test_predictions <- predict(Logit, newdata = data_test, type = "response")

test_predictions_class <- ifelse(test_predictions > 0.49, 1, 0)

test_conf_matrix <- table(Predicted = test_predictions_class, Actual = data_test$loan_status)

test_accuracy <- sum(diag(as.matrix(test_conf_matrix))) / sum(test_conf_matrix)
print(paste("Test Accuracy:", round(test_accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 93.09 %"

# Precision = TP / (TP + FP): divide by the PREDICTED-positive row sum.
# Recall    = TP / (TP + FN): divide by the ACTUAL-positive column sum.
# The original code had these two denominators swapped.
train_precision <- train_conf_matrix[2, 2] / sum(train_conf_matrix[2, ])
print(paste("Training Precision:", round(train_precision*100, 2), "%"))
## [1] "Training Precision: 88.51 %"
train_recall <- train_conf_matrix[2, 2] / sum(train_conf_matrix[, 2])
print(paste("Training Recall:", round(train_recall*100, 2), "%"))
## [1] "Training Recall: 88.92 %"
test_precision <- test_conf_matrix[2, 2] / sum(test_conf_matrix[2, ])
print(paste("Test Precision:", round(test_precision*100, 2), "%"))
## [1] "Test Precision: 90.97 %"
test_recall <- test_conf_matrix[2, 2] / sum(test_conf_matrix[, 2])
print(paste("Test Recall:", round(test_recall*100, 2), "%"))
## [1] "Test Recall: 90.68 %"

In the logistic regression model, the following performance metrics were observed:

  • Training Accuracy: 91.45%
  • Test Accuracy: 93.09%
  • Training Precision: 88.92%
  • Training Recall: 88.51%
  • Test Precision: 90.68%
  • Test Recall: 90.97%

These metrics provide an overview of the model’s ability to correctly predict loan approval status. The high accuracy and precision scores indicate a strong predictive performance. Additionally, balanced recall scores suggest that the model effectively captures both approved and rejected instances. This comprehensive evaluation demonstrates the logistic regression model’s robustness in making accurate predictions on both the training and test datasets.

6.3.6 Confusion matrix

library("regclass")
# confusion_matrix(admitLogit)
# Cross-tabulate actual vs predicted classes from the fitted Logit model.
ezids::xkabledply( confusion_matrix(Logit), title = "Confusion matrix from Logit Model" )
Confusion matrix from Logit Model
Predicted Approved Predicted Rejected Total
Actual Approved 1979 145 2124
Actual Rejected 150 1141 1291
Total 2129 1286 3415

This confusion matrix provides a detailed breakdown of the model's predictions and actual outcomes for the two classes (Approved and Rejected). Taking Approved as the positive class, it includes True Positives (1979), False Negatives (145), False Positives (150), and True Negatives (1141). These metrics are useful for assessing the model's performance and for calculating evaluation measures such as precision, recall, and accuracy.

6.3.7 Receiver-Operator-Characteristic (ROC) curve and Area-Under-Curve (AUC)

The Receiver-Operator-Characteristic (ROC) curve plots the true positive rate (sensitivity) against the false positive rate (1 − specificity). The area under the curve typically ranges between 0.5 and 1, and values higher than 0.8 are considered a good model fit.

library("pROC") 
# In-sample predicted probabilities: Logit was fit on data_train, so
# predict() without newdata scores the training split.
prob = predict(Logit, type = "response")
data_train$prob = prob
h <- roc(loan_status ~ prob, data = data_train)
# Area under the curve; prefer 0.8 or higher.
roc_curve = auc(h)
k_logit = roc_curve
#plot(h)
plot(h, main = "ROC Curve", col = "gold", lwd = 2)
text(0.8, 0.2, paste("AUC =", round(auc(h), 3)), col = "black")

# unloadPkg("pROC")

6.3.8 McFadden pseudo R-squared

# McFadden pseudo R^2 = 1 - logLik(full model) / logLik(null model).
# The intercept-only null model must be fit on the SAME data as the full
# model (data_train); the original fit it on the complete dataset, making
# the two log-likelihoods non-comparable.
NullLogit <- glm(loan_status ~ 1, data = data_train, family = "binomial")
mcFadden = 1 - logLik(Logit)/logLik(NullLogit)
mcFadden

In our logistic regression, McFadden's pseudo R-squared is 0.729, computed from the log-likelihoods of the fitted model (8 degrees of freedom) and an intercept-only null model. It indicates how well the model fits the data compared to the intercept-only baseline: a higher pseudo R-squared suggests a better fit, although there is no universal threshold. A value of 0.729 points to a substantially better fit than the null model.

6.4 Data preprocessing

6.4.1 Factoring the data

# KNN needs numeric inputs: coerce the self_employed factor to its integer
# level codes (Yes = 1, No = 2 with the level order set earlier).
# NOTE(review): level codes are an arbitrary encoding for a distance metric;
# a 0/1 indicator plus feature scaling would likely serve KNN better — confirm.
data_train$self_employed<-as.numeric(data_train$self_employed)
data_test$self_employed<-as.numeric(data_test$self_employed)
str(data_train)

6.5 KNN Model creation

6.5.1 Finding the best K value

library("class")
library("ggplot2")
# Fit a KNN classifier for a given k and report the validation accuracy.
# k           : number of neighbours
# train_set   : numeric features for the training rows
# val_set     : numeric features for the validation rows
# train_class : labels for the training rows
# val_class   : labels for the validation rows
# Returns a 1x2 matrix with columns k and accuracy.
chooseK = function(k, train_set, val_set, train_class, val_class){

  set.seed(1)  # knn() breaks distance ties at random; fix the seed
  predicted = knn(train = train_set,
                  test = val_set,
                  cl = train_class,
                  k = k)

  confusion = table(predicted, val_class)

  # Proportion of diagonal (correctly classified) cells.
  hit_rate = sum(diag(confusion)) / sum(confusion)
  cbind(k = k, accuracy = hit_rate)
}


# Feature columns used by every KNN fit below; defined once instead of
# repeating the long column list four times (original duplicated it).
knn_features <- c("no_of_dependents", "self_employed", "income_annum",
                  "loan_amount", "loan_term", "cibil_score",
                  "luxury_assets_value", "bank_asset_value")

# Evaluate odd k from 1 to 21 on the held-out test set.
knn_different_k = sapply(seq(1, 21, by = 2),
                         function(x) chooseK(x,
                                             train_set = data_train[, knn_features],
                                             val_set = data_test[, knn_features],
                                             train_class = data_train[, "loan_status"],
                                             val_class = data_test[, "loan_status"]))

str(knn_different_k)
##  num [1:2, 1:11] 1 0.542 3 0.573 5 ...
knn_different_k = data.frame(k = knn_different_k[1,],
                             accuracy = knn_different_k[2,])

# Accuracy as a function of k.
ggplot(knn_different_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) + 
  labs(title = "accuracy vs k")+theme_minimal()

6.6 KNN Evaluation metrices

6.6.1 For training

# Training-set performance of KNN with k = 18.
# set.seed(1) added so random tie-breaking in knn() is reproducible,
# matching the test block below.
# NOTE(review): k = 18 was never among the odd k values evaluated above,
# and an even k allows voting ties — confirm this choice.
set.seed(1)
train_predictions <- knn(train = data_train[, c("no_of_dependents","self_employed","income_annum","loan_amount","loan_term","cibil_score","luxury_assets_value","bank_asset_value")],
                          test = data_train[, c("no_of_dependents","self_employed","income_annum","loan_amount","loan_term","cibil_score","luxury_assets_value","bank_asset_value")],
                          cl = data_train[, "loan_status"],
                          k = 18)


train_conf_matrix <- table(Predicted = train_predictions, Actual = data_train$loan_status)

print(train_conf_matrix)
##           Actual
## Predicted  Approved Rejected
##   Approved     1855      980
##   Rejected      269      311
train_accuracy <- sum(diag(as.matrix(train_conf_matrix))) / sum(train_conf_matrix)
print(paste("Training Accuracy:", round(train_accuracy * 100, 2), "%"))
## [1] "Training Accuracy: 63.43 %"

6.6.2 For testing

library("class")
set.seed(1)
# Score the held-out test split with the same 18-NN configuration.
knn_cols <- c("no_of_dependents", "self_employed", "income_annum",
              "loan_amount", "loan_term", "cibil_score",
              "luxury_assets_value", "bank_asset_value")
bank_18NN = knn(train = data_train[, knn_cols],
                test = data_test[, knn_cols],
                cl = data_train[, "loan_status"],
                k = 18)
str(bank_18NN)
##  Factor w/ 2 levels "Approved","Rejected": 1 1 1 1 1 2 1 1 2 2 ...
length(bank_18NN)
## [1] 854
table(bank_18NN)
## bank_18NN
## Approved Rejected 
##      704      150
conf_matrix <- table(Predicted = bank_18NN, Actual = data_test$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Test Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 61.12 %"

6.6.3 Confusion matrix

library("gmodels")
# Detailed cross-tabulation (counts plus row/column/table proportions) of
# actual vs predicted test-set classes.
# NOTE(review): the variable name looks copied from an iris example;
# consider renaming if it is not referenced elsewhere in the project.
IRISPREDCross <- CrossTable(data_test[,"loan_status"], bank_18NN, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  854 
## 
##  
##                            | bank_18NN 
## data_test[, "loan_status"] |  Approved |  Rejected | Row Total | 
## ---------------------------|-----------|-----------|-----------|
##                   Approved |       452 |        80 |       532 | 
##                            |     0.850 |     0.150 |     0.623 | 
##                            |     0.642 |     0.533 |           | 
##                            |     0.529 |     0.094 |           | 
## ---------------------------|-----------|-----------|-----------|
##                   Rejected |       252 |        70 |       322 | 
##                            |     0.783 |     0.217 |     0.377 | 
##                            |     0.358 |     0.467 |           | 
##                            |     0.295 |     0.082 |           | 
## ---------------------------|-----------|-----------|-----------|
##               Column Total |       704 |       150 |       854 | 
##                            |     0.824 |     0.176 |           | 
## ---------------------------|-----------|-----------|-----------|
## 
## 

6.6.4 Testing results

library("caret") 
# Full metric suite for the 18-NN test predictions. caret takes the first
# factor level ("Approved") as the positive class, so the sensitivity,
# precision and recall reported below all refer to approvals.
cm = confusionMatrix(bank_18NN, reference = as.factor(data_test[, "loan_status"]) )
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Approved Rejected
##   Approved      452      252
##   Rejected       80       70
##                                         
##                Accuracy : 0.611         
##                  95% CI : (0.578, 0.644)
##     No Information Rate : 0.623         
##     P-Value [Acc > NIR] : 0.771         
##                                         
##                   Kappa : 0.075         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.850         
##             Specificity : 0.217         
##          Pos Pred Value : 0.642         
##          Neg Pred Value : 0.467         
##              Prevalence : 0.623         
##          Detection Rate : 0.529         
##    Detection Prevalence : 0.824         
##       Balanced Accuracy : 0.534         
##                                         
##        'Positive' Class : Approved      
## 
# Extract precision and recall for the positive class ("Approved") from
# caret's per-class statistics.
precision <- cm$byClass["Precision"]
recall <- cm$byClass["Recall"]

print(paste("Precision:", round(precision, 2)))
## [1] "Precision: 0.64"
print(paste("Recall:", round(recall, 2)))
## [1] "Recall: 0.85"

6.6.5 AUC ROC curve for KNN

library("class")
library("pROC")

set.seed(1)

# ROC built from the HARD 0/1 class labels rather than probabilities.
# NOTE(review): with a single decision threshold the "curve" has only one
# interior point; re-running knn() with prob = TRUE would give a proper
# probability-based ROC — confirm whether that was intended.
roc_curve <- roc(ifelse(data_test[, "loan_status"] == "Approved", 1, 0), ifelse(bank_18NN == "Approved", 1, 0))

plot(roc_curve, main = "ROC Curve", col = "gold", lwd = 2)

text(0.8, 0.2, paste("AUC =", round(auc(roc_curve), 3)), col = "black")

# Diagonal reference line: performance of a random classifier.
abline(a = 0, b = 1, col = "gray", lty = 2)

k<-auc(roc_curve)

6.7 Decision Tree model

6.7.1 Training

library(rpart)

# Classification tree on the same predictor set used for KNN.
tree_model <- rpart(
  loan_status ~ no_of_dependents + self_employed + income_annum +
    loan_amount + loan_term + cibil_score + luxury_assets_value +
    bank_asset_value,
  data = data_train, method = "class"
)

# In-sample (training) accuracy from the confusion table diagonal.
tree_predictions <- predict(tree_model, newdata = data_train, type = "class")
conf_matrix <- table(tree_predictions, data_train$loan_status)
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Train Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Train Accuracy: 96.81 %"

6.7.2 Testing

library(rpart)

# Reuse the tree fitted in section 6.7.1; the original code refitted an
# identical model here with the same formula/data/arguments, which only
# duplicated work without changing the predictions.
tree_predictions <- predict(tree_model, newdata = data_test, type = "class")

conf_matrix <- table(tree_predictions, data_test$loan_status)

# Test accuracy = trace of the confusion table over its total count.
accuracy <- sum(diag(as.matrix(conf_matrix))) / sum(conf_matrix)
print(paste("Test Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 96.49 %"

6.7.3 Testing results

library(caret)

# Held-out evaluation of the decision tree with caret's confusionMatrix().
cm <- confusionMatrix(tree_predictions, reference = as.factor(data_test[, "loan_status"]))
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Approved Rejected
##   Approved      529       27
##   Rejected        3      295
##                                        
##                Accuracy : 0.965        
##                  95% CI : (0.95, 0.976)
##     No Information Rate : 0.623        
##     P-Value [Acc > NIR] : < 2e-16      
##                                        
##                   Kappa : 0.924        
##                                        
##  Mcnemar's Test P-Value : 2.68e-05     
##                                        
##             Sensitivity : 0.994        
##             Specificity : 0.916        
##          Pos Pred Value : 0.951        
##          Neg Pred Value : 0.990        
##              Prevalence : 0.623        
##          Detection Rate : 0.619        
##    Detection Prevalence : 0.651        
##       Balanced Accuracy : 0.955        
##                                        
##        'Positive' Class : Approved     
## 
# Precision and recall for the positive ("Approved") class.
tree_metrics <- cm$byClass[c("Precision", "Recall")]
precision <- tree_metrics["Precision"]
recall <- tree_metrics["Recall"]

print(paste("Precision:", round(precision, 2)))
## [1] "Precision: 0.95"
print(paste("Recall:", round(recall, 2)))
## [1] "Recall: 0.99"

6.7.4 AUC ROC curve of decision trees

library(pROC)

# Use local numeric copies instead of coercing tree_predictions and
# data_test$loan_status in place: the in-place version destroyed the factor
# column that later sections (e.g. the random-forest confusion matrix) reuse.
# as.numeric() on the factors yields the 1/2 level codes, which is all that
# roc() needs here.
tree_pred_num <- as.numeric(tree_predictions)
loan_status_num <- as.numeric(data_test$loan_status)

roc_curve2 <- roc(loan_status_num, tree_pred_num)

plot(roc_curve2, main = "ROC Curve", col = "gold", lwd = 2)

auc_value <- auc(roc_curve2)
text(0.8, 0.2, paste("AUC =", round(auc_value, 3)), col = "black", cex = 1.2)

cat("AUC:", auc_value, "\n")
## AUC: 0.955

6.7.5 AUC Scores

# Side-by-side AUC comparison of the models evaluated so far.
print(paste("AUC score of KNN", k))
## [1] "AUC score of KNN 0.533507682249101"
# Fixed typo in the printed label: "Tress" -> "Trees".
print(paste("AUC score of decision Trees", auc_value))
## [1] "AUC score of decision Trees 0.95525498528931"
print(paste("AUC score of Logistic regressor", k_logit))
## [1] "AUC score of Logistic regressor 0.96766802184032"

6.8 Random forest

library(randomForest)

# Fit a 500-tree random forest classifier on the training split.
# importance = TRUE stores variable-importance measures for section 6.8.1.
# NOTE(review): unlike the decision-tree formula (6.7), self_employed is
# omitted from the predictors here — confirm this is intentional.
# NOTE(review): no set.seed() precedes this stochastic fit, so the OOB error
# and importance values shown below will vary slightly between runs.
rf_model <- randomForest(loan_status ~ no_of_dependents + income_annum + loan_amount +
                          loan_term + cibil_score + luxury_assets_value + bank_asset_value,
                          data = data_train, ntree = 500, importance = TRUE)

# Printed summary: forest type, tree count, mtry, OOB error estimate,
# and the out-of-bag confusion matrix.
print(rf_model)
## 
## Call:
##  randomForest(formula = loan_status ~ no_of_dependents + income_annum +      loan_amount + loan_term + cibil_score + luxury_assets_value +      bank_asset_value, data = data_train, ntree = 500, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 1.58%
## Confusion matrix:
##          Approved Rejected class.error
## Approved     2108       16     0.00753
## Rejected       38     1253     0.02943
6.8.1 Feature Importance Summary

# Variable importance from the fitted forest: per-class and overall
# permutation importance (MeanDecreaseAccuracy) plus Gini importance
# (MeanDecreaseGini); larger values indicate more influential predictors.
feature_importance <- importance(rf_model)
print(feature_importance)
##                     Approved Rejected MeanDecreaseAccuracy MeanDecreaseGini
## no_of_dependents        1.33     4.34                 3.74             15.8
## income_annum           15.75    14.63                21.43             37.2
## loan_amount            26.23    17.25                32.05             58.8
## loan_term              87.47    80.78               103.77             95.8
## cibil_score           383.23   399.68               440.78           1328.5
## luxury_assets_value    12.77    10.13                16.37             38.7
## bank_asset_value       10.79     9.59                14.86             29.8
  1. CIBIL Score (cibil_score):
    • By far the highest importance in both predictive accuracy (440.78) and Gini impurity reduction (1328.5).
  2. Loan Term (loan_term):
    • A clear but distant second in both accuracy and impurity reduction.
  3. Loan Amount (loan_amount):
    • Shows substantial importance in both metrics.
  4. Income (income_annum) and Luxury Assets (luxury_assets_value):
    • Moderately important.
  5. Bank Asset Value (bank_asset_value):
    • Relatively lower importance.
  6. Number of Dependents (no_of_dependents):
    • Appears least impactful on model performance.

6.9 Model metrics random forest

6.9.1 Training

# In-sample predictions of the random forest on the training split.
rf_predictions <- predict(rf_model, newdata = data_train)
rf_train_cm <- table(rf_predictions, data_train$loan_status)

# Training accuracy = correctly classified rows over all rows.
accuracy <- sum(diag(as.matrix(rf_train_cm))) / sum(rf_train_cm)
print(paste("Train Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Train Accuracy: 100 %"

6.9.2 Testing

# Held-out predictions of the random forest on the test split.
rf_predictions <- predict(rf_model, newdata = data_test, type = "class")

rf_test_cm <- table(rf_predictions, data_test$loan_status)

# Test accuracy from the confusion table diagonal.
accuracy <- sum(diag(as.matrix(rf_test_cm))) / sum(rf_test_cm)
print(paste("Test Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Test Accuracy: 98.59 %"

6.9.3 AUC ROC Curve

library(pROC)

# Work on local numeric copies rather than coercing rf_predictions and
# data_test$loan_status in place; the in-place conversion clobbered those
# objects for any later use. as.numeric() maps the factors to their 1/2
# level codes, which is sufficient input for roc().
rf_pred_num <- as.numeric(rf_predictions)
status_num <- as.numeric(data_test$loan_status)

roc_curve2 <- roc(status_num, rf_pred_num)

plot(roc_curve2, main = "ROC Curve", col = "gold", lwd = 2)

auc_value <- auc(roc_curve2)
text(0.8, 0.2, paste("AUC =", round(auc_value, 3)), col = "black", cex = 1.2)

cat("AUC:", auc_value, "\n")
## AUC: 0.983

6.10 Model Result chart

# Summary chart: AUC of each model in the order they were built
# (values taken from sections 6.6-6.9).
auc_values <- c(0.5335, 0.9553, 0.9677, 0.9832)
model_names <- c("KNN", "Decision Tree", "Logistic Regression", "Random Forest")

plot(auc_values, type = "o", col = "gold", pch = 16, lty = 1,
     xlab = "Model", ylab = "AUC Score",
     main = "AUC Score Progression", xaxt = "n")

grid()

# seq_along() is safe for zero-length vectors, unlike 1:length().
axis(side = 1, at = seq_along(model_names), labels = model_names)

# Reference line marking an AUC threshold of 0.8.
abline(h = 0.8, col = "red", lty = 2)